{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "toc": true
   },
   "source": [
    "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
    "<div class=\"toc\"><ul class=\"toc-item\"></ul></div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-16T13:56:54.795585Z",
     "start_time": "2021-05-16T13:56:54.786963Z"
    }
   },
   "outputs": [],
   "source": [
    "import re\n",
    "import json\n",
    "\n",
    "no_ascii_char = ['é', '‟', 'ø', 'š', '«', '¸', '…', '\\xad', '¬', 'θ', ';', '\\x9d', '∝', '´', '¾', '€', '÷', '′', '„', 'в', '£', 'â', 'ƞ', 'ã', '≠', 'œ', '¼', '∏', '¢', '🙂', '“', 'с', '’', '∠', 'т', 'π', '¤', '·', '™', '⁄', '�', 'ª', '—', '∪', '•', '₹', '³', '‘', '¥', '½', 'δ', '∩', '²', 'ō', 'ƫ', 'ў', 'º', 'ʼ', 'æ', '∆', '≥', 'е', '×', '¯', '∗', 'ç', '\\x81', '√', 'ï', 'ˆ', '¹', '‚', 'ɵ', '−', 'í', '°', '≤', '–', '♠', '”']\n",
    "\n",
    "def get_number_mapping(input_list):\n",
    "    sym2num = {}\n",
    "    num2sym = {}\n",
    "    for num_i in input_list:\n",
    "        if num_i not in sym2num:\n",
    "            sym_i = 'n{}'.format(len(sym2num))\n",
    "            sym2num[sym_i] = num_i\n",
    "            num2sym[num_i] = sym_i\n",
    "    return sym2num, num2sym\n",
    "\n",
    "def is_ascii(s):\n",
    "    res = []\n",
    "    for c in s:\n",
    "        if ord(c) >= 128:\n",
    "            res.append(c)\n",
    "    return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-16T14:08:04.302239Z",
     "start_time": "2021-05-16T14:08:04.266285Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def preprocess(data_version = 'train'):\n",
    "    with open('./{}.json'.format(data_version), 'r') as f:\n",
    "        data = json.load(f)\n",
    "    print(len(data))\n",
    "    with open('./{}.txt'.format(data_version), 'w') as g:\n",
    "        to_write = []\n",
    "        for item in data:\n",
    "            problem = str(item['Problem']).lower()\n",
    "            problem = problem.replace(',', ' , ').replace('(', ' ( ').replace(')', ' ) ').replace('|', ' | ').strip().strip('|').strip()\n",
    "            problem = ' ' + \" \".join(problem.split()) + ' '\n",
    "            \n",
    "            result = str(item['linear_formula']).lower()\n",
    "            result = result.replace(',', ' , ').replace('(', ' ( ').replace(')', ' ) ').replace('|', ' | ').strip().strip('|').strip()\n",
    "            result = ' ' + \" \".join(result.split()) + ' '\n",
    "            numbers = re.findall(r\"\\d+\\.?\\d*\", problem)\n",
    "            sym2num, num2sym = get_number_mapping(numbers)\n",
    "            for num_i in num2sym:\n",
    "                if ' '+num_i+' ' in problem:\n",
    "                    problem = problem.replace(' '+num_i+' ', ' '+num2sym[num_i]+' ')\n",
    "                else:\n",
    "                    raise RuntimeError()\n",
    "            problem = problem.strip()\n",
    "            result = result.strip()\n",
    "            for c in no_ascii_char:\n",
    "                problem = problem.replace(c, 'no_ascii_{}'.format(no_ascii_char.index(c)))\n",
    "                result = result.replace(c, 'no_ascii_{}'.format(no_ascii_char.index(c)))\n",
    "            to_write.append((problem,result))\n",
    "        to_write = sorted(to_write, key=lambda x:len(x[0].split()))\n",
    "        print('writing...')\n",
    "        for data_i in to_write:\n",
    "            g.write(data_i[0]+'\\t'+data_i[1]+'\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-16T14:19:49.253994Z",
     "start_time": "2021-05-16T14:19:43.647254Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "29837\n",
      "writing...\n"
     ]
    }
   ],
   "source": [
    "preprocess('train')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-16T14:19:51.212849Z",
     "start_time": "2021-05-16T14:19:49.693098Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4475\n",
      "writing...\n",
      "2985\n",
      "writing...\n"
     ]
    }
   ],
   "source": [
    "preprocess('dev')\n",
    "preprocess('test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-16T14:32:46.953160Z",
     "start_time": "2021-05-16T14:32:46.871938Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "384\n"
     ]
    }
   ],
   "source": [
    "arr_len = []\n",
    "with open('train.txt', 'r') as f:\n",
    "    for l in f:\n",
    "        arr_len.append(len(l.strip().split('\\t')[1].split()))\n",
    "import numpy as np\n",
    "print(np.max(arr_len))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": true,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
